## Coding 4 Conservation 2022
## Introduction to Mixed Effects Models Part 2
## Emily Ruhs and David Klinges

rm(list=ls()) #clears your history and environment

library(lme4) #generalized linear model package used on Monday
library(lmerTest)
library(ggplot2) #ggplot visualization package. We will alternate between this and the base R plot() functions

# The database we will use are a group  of 100 lemurs for which a number of measurements have been carried out 
# in the field. These include height (in cm), age (in months), gender and number of gastrointestinal parasites 
# found in their feces. This is the dataset that was previously used in E2M2 and in the linear regression tutorial 
# last week. We've developed this further to include data that could be used to conduct glms, glmms, and gams. 
# Therefore, these same 100 lemurs were tracked over three years and the same variables above were measured in the 
# last year (2023) an intervention was applied to 50 of the lemurs to ask whether an anti-parasitic drug helped 
# lemurs to clear GI parasite infections.We will explore the relationship between these different variables, and 
# will develop statistical models to quantify their association.

#change to your own working directory. This version is for a Macbook
#setwd('/Users/emilyruhs/Desktop/UChi_Brook_Lab/Coding4Conservation/GLM_2022')

lemur.data <- read.csv('Lemurdata_glm.csv')

#---------------------------------------------------------#
# 1. Data exploration: Plots and summary statistics ------
#---------------------------------------------------------#

# Explore the database #
head(lemur.data) #shows the first 6 entries in the dataset
dim(lemur.data) #300 rows of 9 variables - this is also shown in your environment in the top right
names(lemur.data) #names gives you the names of the variables
summary(lemur.data) #summary tells you the summary statistics (mean, min, quantiles) of each of the variables

str(lemur.data) #str() will tell you how the variables are categorized (numerical, integer, character, factor)
# what I see sis that year and treatment are considered an integar and character, respectively. For the sake of analysis 
# and clarity, I am going to re-categorize these as factors...for example, because year is not an integer
lemur.data$sex <- as.factor(lemur.data$sex)
lemur.data$year <- as.factor(lemur.data$year)
lemur.data$treatment <- as.factor(lemur.data$treatment)


# alternatively for treatment, you could change this to a binomial (0=not treated or 1=treated). 
# QUESTION 1 ----
# How would you re-classify treatment "oui" = 1 and treatment "non" = 0  using code in R??


# okay so now that we have the data loaded and the structure of the data the way that we want it, we can begin to explore 
# how the data look. 

# access the variables in the dataframe without having to call the dataframe
# i.e. can print 'height' instead of 'lemur.data$height'
attach(lemur.data)
# detach(lemur.data)

hist(GIparasites, col='grey')
# QUESTION 2 ----
# Are these data normally distributed?

boxplot(GIparasites, ylab='Number of GI parasites')
#but these plots don't take into account the many other variables and factors in the dataset. 
boxplot(GIparasites ~ sex, col = 'grey') #males might have slightly more GIparasites

# Moving forward, I am going to switch to ggplot2, which is what I am more comfortable working in. There is a great
#"cheatsheet" for ggplot2 here:
# https://www.maths.usyd.edu.au/u/UG/SM/STAT3022/r/current/Misc/data-visualization-2.1.pdf

# what does the distribution of the data look for number of GI parasites for males and females separately?
ggplot(data=lemur.data) +
  geom_histogram(aes(x=GIparasites, fill=sex)) + theme_classic()
  #scale_fill_manual(values=c("#69b3a2", "#404080")) # this would allow you to pick the colors in your graph 
# check out more colors here: (http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/)
# what we see is that the GI parasites are not normally distributed, and instead follow a poisson distribution

# we can confirm that the data are not normal by checking the qqplot
qqnorm(GIparasites)
qqline(GIparasites, col="blue")

#--------------------------------------------#
#   2. Generalized linear models: GLM -------
#---------------------------------------------#

# there are a number of ways you could treat these data in terms of repeated measures. For example: The individual 
# lemurs are sampled four times over four different years. There are also 10 lemur families that are sampled four times 
# over four different years. We also have multiple families from three different spatial locations.

# QUESTION 3 ----
# if your data looks similar to this dataset, your first question should be, "what am I interested is?" or 
# "what is my main research question?"

# let's first ask how do the number of GI parasites change over a given year? We probably want to exclude the 
#intervention year (2023)
new.dat <- subset(lemur.data,year!='2022') #you should now have only 300 observations

ggplot(data=new.dat, aes(x=year, y=GIparasites))+
  geom_boxplot() + theme_classic() + xlab("\nYear") + ylab("Number of GI parasites\n")

# they don't seem very different by year, but let's confirm this with some models:
lm1 <- lm(GIparasites ~ year, data=new.dat)
summary(lm1)
m1 <- glm(GIparasites ~ year, data=new.dat)
summary(m1)

# QUESTION 4 ----
# Do you remember why these two model outputs are the same? Revisit the lecture and the assumptions that are taking place

#Okay so now let's conduct the real model for the question of whether the number of GI parasites change over a given year?
m1 <- glm(GIparasites ~ year, family=poisson, data=new.dat)
summary(m1)
# so there are differences by year. In a summary output where you have more than two groups that you are comparing
# they will show the values for all groups except the first (that is ordered, i.e. 2018 is the first year). This means
# the glm is comparing all groups to the first group in the dataset.

#what about sex? Does this impact the number of GI parasites?
ggplot(data=new.dat, aes(x=sex, y=GIparasites))+
  geom_boxplot() + theme_classic() + xlab("\nSex") + ylab("Number of GI parasites\n")

# then let's separate this by sex and year
ggplot(data=new.dat, aes(x=sex, y=GIparasites, fill=year))+
  geom_boxplot()+ theme_classic() + xlab("\nSex") + ylab("Number of GI parasites\n")

# we alredy know there are effects of year, but it looks like they might be different by sex, 
# but let's again confirm with a glm
m2 <- glm(GIparasites ~ sex, family=poisson, data=new.dat)
summary(m2)
# looks like males have more parasites. However, we also might also have apriori reasons to believe 
# that male lemurs do not have more GIparasites OR we really just aren't interested in the sex effects, but
# know we need to control for it's effect.

# this glm ask whether there are interaction effects, as noted by the '*'. So we are asking: Are there 
# differences in GI parasites by sex and across years? In this example, we would be breaking the data into
# six different groups and ask if those above groups are statistically different
m3 <- glm(GIparasites ~ sex*year, family=poisson, data=new.dat)
summary(m3)
#so there is an effect of sex by year

#----------------------------------------------------#
#   3. Generalized linear mixed models: GLMM -------
#----------------------------------------------------#

# okay so now we've established that there are effects of sex and year on the dataset, but these might not be the 
# main focus of why we collected that data. First my main research question is if certain sites naturally have 
# lemurs with more parasites. Let's assume this might be because the sites are along an urbanization gradient 
# and some are more rural than others.
ggplot(data=new.dat, aes(x=location, y=GIparasites, fill=location))+
  geom_boxplot()+ theme_classic() + xlab("\nLocation") + ylab("Number of GI parasites\n")
# a small note, if you do not want your legend to the side of this graph you can add: '+theme(legend.position="none")' to the end of your code

# now that we see the data, let's use a glmm to ask whether the sites are different, but I want to control for year 
# and individual ID!
m4 <- glmer.nb(GIparasites ~ location + sex + (1|year) + (1|ID), data=new.dat)
summary(m4)
# okay sex is significant, but are there differences by location and sex?
m5 <- glmer.nb(GIparasites ~ location*sex + (1|year)+ (1|ID), data=new.dat)
summary(m5)

# But if your groups are different, which ones?
# in this instance, we might want to run a post-hoc test. We need to load a new package to look at post-hoc tests
library(multcomp)
summary(glht(m5, mcp(location="Tukey")))

# QUESTION 5 ----
# what groups are different?

#let's plot this just for fun!
ggplot(data=new.dat, aes(x=location, y=GIparasites, fill=sex))+
  geom_boxplot()+ theme_classic() + xlab("\nLocation") + ylab("Number of GI parasites\n")

# Okay let's look at one more example of a glm. This time, I want to know if the intervention worked
# in 2022. If you remember, in this example, half of the lemurs we were tracking (n=50) were given 
# a medication to see if it cleared or reduced their GI parasites. 

#so let's switch back to the whole dataset, "lemur.data" that has all data observations
ggplot(data=lemur.data, aes(x=year, y=GIparasites))+
  geom_boxplot()+ theme_classic() + xlab("\nLocation") + ylab("Number of GI parasites\n")
#it does look like there might be fewer parasites in 2022!

library(dplyr) # let's load this package that allows us to manipulate data. 
lemur.data<-
  lemur.data %>%
  mutate(treatment = case_when(
  as.character(treatment) %in% c("Non") ~ "0",
  as.character(treatment) %in% c("Oui") ~ "1",
  TRUE ~ as.character(treatment)
)
)
# change treatment to a factor
lemur.data$treatment <- as.factor(lemur.data$treatment)

ggplot(data=lemur.data, aes(x=treatment, y=GIparasites, fill=sex))+
  geom_boxplot()+ theme_classic() + xlab("\nTreatment") + ylab("Number of GI parasites\n")

# Again, we're not super interested in sex effects, so let's collapse those groups
ggplot(data=lemur.data, aes(x=treatment, y=GIparasites))+
  geom_boxplot()+ theme_classic() + xlab("\nTreatment") + ylab("Number of GI parasites\n")

# So let's run a model
m6 <- glmer.nb(GIparasites ~ treatment + (1|sex) + (1|ID), data=lemur.data)
summary(m6)

# QUESTION 6 ----
# did the treatment work? 


# QUESTION 7 -----
# what are some other variables in the dataset that you might want to set as "random"?


#-------------------------------------#
#   4. Generalized additive models: GAMs ------
#--------------------------------------#
lemur.data$year <- as.numeric(lemur.data$year)

# Okay, now what if we want to test out to see how GI parasites change over time
# We can use GAMs to get a clearer picture of what this continuous predicted pattern might look like

gam1 <- gam(GIparasites ~ s(as.numeric(year), k=4,bs="tp"),  data=lemur.data, family="gaussian")
summary(gam1) #use this to get your summary table
gam.check(gam1) #use this to check your gam residual plots

pred.df <- dplyr::select(lemur.data, year)
pred.df$predicted_count <- predict.gam(gam1, type = "response", se.fit = T)$fit
pred.df$predicted_count_se <- predict.gam(gam1, type = "response", se.fit = T)$se
pred.df$predicted_count_lci <- pred.df$predicted_count -1.96*pred.df$predicted_count_se
pred.df$predicted_count_uci <- pred.df$predicted_count +1.96*pred.df$predicted_count_se

dev.new()
ggplot(data=lemur.data) + geom_point(aes(x=year, y=GIparasites), alpha=.6, show.legend = F)+
  theme_classic()+ ylab("GI parasites\n") + xlab("\nCollection year") +
  geom_line(data=pred.df, aes(x=year, y=predicted_count), size=1) + 
  geom_ribbon(data=pred.df, aes(x=year, ymin=predicted_count_lci, ymax=predicted_count_uci), alpha=.3)


# now let's plot this pattern by location
gam2 <- gam(GIparasites ~ location + s(as.numeric(year), k=4,bs="tp"),  data=lemur.data, family="gaussian")
pred.df <- dplyr::select(lemur.data, year, location)
#pred.df <- pred.df[complete.cases(pred.df),]
pred.df$predicted_count <- predict.gam(gam2, type = "response", se.fit = T)$fit
pred.df$predicted_count_se <- predict.gam(gam2, type = "response", se.fit = T)$se
pred.df$predicted_count_lci <- pred.df$predicted_count -1.96*pred.df$predicted_count_se
pred.df$predicted_count_uci <- pred.df$predicted_count +1.96*pred.df$predicted_count_se

dev.new()
ggplot(data=lemur.data) + geom_point(aes(x=year, y=GIparasites, color=location, shape=location), alpha=.6, show.legend = F)+
  theme_classic()+ ylab("GI parasites\n") + xlab("\nCollection year") +
  geom_line(data=pred.df, aes(x=year, y=predicted_count, color=location, linetype=location), size=1) + 
  geom_ribbon(data=pred.df, aes(x=year, ymin=predicted_count_lci, ymax=predicted_count_uci, fill=location), alpha=.3)
